set.seed(2021)
library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)
library(scater)
library(patchwork)
Load functions
source("../scripts/StabMap_functions.R")
mt = MouseGastrulationData::AtlasSampleMetadata
samples = mt$sample[mt$stage %in% c("E6.5", "E7.5", "E8.5")]
SCE <- EmbryoAtlasData(samples = samples)
SCE
## class: SingleCellExperiment
## dim: 29452 37551
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(37551): cell_1 cell_2 ... cell_139330 cell_139331
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
SCE <- logNormCounts(SCE)
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
## Epiblast Primitive Streak
## "#635547" "#DABE99"
## Caudal epiblast PGC
## "#9E6762" "#FACB12"
## Anterior Primitive Streak Notochord
## "#C19F70" "#0F4A9C"
## Def. endoderm Gut
## "#F397C0" "#EF5A9D"
## Nascent mesoderm Mixed mesoderm
## "#C594BF" "#DFCDE4"
## Intermediate mesoderm Caudal Mesoderm
## "#139992" "#3F84AA"
## Paraxial mesoderm Somitic mesoderm
## "#8DB5CE" "#005579"
## Pharyngeal mesoderm Cardiomyocytes
## "#C9EBFB" "#B51D8D"
## Allantois ExE mesoderm
## "#532C8A" "#8870AD"
## Mesenchyme Haematoendothelial progenitors
## "#CC7818" "#FBBE92"
## Endothelium Blood progenitors 1
## "#FF891C" "#F9DECF"
## Blood progenitors 2 Erythroid1
## "#C9A997" "#C72228"
## Erythroid2 Erythroid3
## "#F79083" "#EF4E22"
## NMP Rostral neurectoderm
## "#8EC792" "#65A83E"
## Caudal neurectoderm Neural crest
## "#354E23" "#C3C388"
## Forebrain/Midbrain/Hindbrain Spinal cord
## "#647A4F" "#CDE088"
## Surface ectoderm Visceral endoderm
## "#F7F79E" "#F6BFCB"
## ExE endoderm ExE ectoderm
## "#7F6874" "#989898"
## Parietal endoderm
## "#1A1A1A"
Randomly subset 5000 cells from this data, for speed reasons
cells_thin = sample(colnames(SCE), 5000)
SCE_thin = SCE[,cells_thin]
SCE_thin
## class: SingleCellExperiment
## dim: 29452 5000
## metadata(0):
## assays(2): counts logcounts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(5000): cell_99688 cell_48887 ... cell_47358 cell_42653
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
Account for gene name difference for Cavin3/Prkcdbp. Then convert to the ENSEMBL gene IDs.
seqFISH_gene_symbols = c(scan(what = "character", file = "../data/seqFISH_genes.txt"), "Prkcdbp")
table(seqFISH_gene_symbols %in% rowData(SCE_thin)[,2]) # only one missing, which is accounted for
##
## FALSE TRUE
## 1 351
seqFISH_genes = rownames(SCE_thin)[rowData(SCE_thin)[,2] %in% seqFISH_gene_symbols]
length(seqFISH_genes)
## [1] 351
This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.
full_sim = generateSimilarity(SCE_thin, batchFactor = SCE_thin$sample)
dim(full_sim)
## [1] 5000 5000
full_sim[1:5,1:5]
## cell_99688 cell_48887 cell_138969 cell_47896 cell_137543
## cell_99688 1 0 0 0 0
## cell_48887 0 1 0 0 0
## cell_138969 0 0 1 0 0
## cell_47896 0 0 0 1 0
## cell_137543 0 0 0 0 1
plotAdditional = list("celltype",
list(scale_colour_manual(values = celltype_colours),
theme(legend.position = "bottom")))
scores_seqFISH = getSubsetUncertainty(SCE_thin,
subsetGenes = seqFISH_genes,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
scores_df = data.frame(cell = colnames(SCE_thin),
score = scores_seqFISH,
celltype = SCE_thin$celltype)
ggplot(scores_df, aes(x = celltype, y = score)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
I can also estimate the uncertainty for the full data, given the subset. This means that I can estimate the uncertainty for all cells, but only pull from the 5000x5000 similarity distribution from the random subset of cells. This is of course an estimate, but it’s worthwhile knowing one can circumvent needing to calculate the similarity across all cells.
joint_sample = SCE$sample
names(joint_sample) <- colnames(SCE)
scores_all = getSubsetUncertainty(SCE_thin,
querySCE = SCE[, setdiff(colnames(SCE), colnames(SCE_thin))],
subsetGenes = seqFISH_genes,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional,
jointBatchFactor = joint_sample)
length(scores_all)
## [1] 37551
head(scores_all)
## cell_99688 cell_48887 cell_138969 cell_47896 cell_137543 cell_49005
## 0.34612245 0.10040816 0.32163265 0.06285714 0.38857143 0.16816327
scores_all_df = data.frame(cell = names(scores_all),
score = scores_all,
celltype = SCE[,names(scores_all)]$celltype,
sample = SCE[,names(scores_all)]$sample,
score_sub = scores_seqFISH[names(scores_all)])
dim(scores_all_df)
## [1] 37551 5
head(scores_all_df)
## cell score celltype sample
## cell_99688 cell_99688 0.34612245 Forebrain/Midbrain/Hindbrain 29
## cell_48887 cell_48887 0.10040816 Blood progenitors 1 19
## cell_138969 cell_138969 0.32163265 Pharyngeal mesoderm 37
## cell_47896 cell_47896 0.06285714 <NA> 19
## cell_137543 cell_137543 0.38857143 Forebrain/Midbrain/Hindbrain 37
## cell_49005 cell_49005 0.16816327 Nascent mesoderm 19
## score_sub
## cell_99688 0.5240816
## cell_48887 0.1779592
## cell_138969 0.5722449
## cell_47896 0.4008163
## cell_137543 0.6424490
## cell_49005 0.5093878
ggplot(scores_all_df, aes(x = score_sub, y = score)) +
geom_point(aes(colour = celltype)) +
theme_classic() +
scale_colour_manual(values = celltype_colours) +
geom_abline(slope = 1, intercept = 0) +
theme(legend.position = "bottom") +
NULL
ggplot(scores_all_df, aes(x = interaction(sample, celltype), y = score)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
Certain cell types do appear to show a higher level of uncertainty, e.g. Allantois and Epiblast.
Here I perform a clustering of the data using the seqFISH genes, extract the clusters with the highest uncertainty values, then re-cluster these cells using the entire transcriptome. Presumably, the markers for these re-clusters represent the gene expression differences that are as yet unaccounted for within the seqFISH gene set.
SCE_sub <- logNormCounts(SCE[seqFISH_genes,])
fit = modelGeneVar(logcounts(SCE_sub))
HVGs = getTopHVGs(fit)
length(HVGs)
## [1] 181
SCE_sub <- runPCA(SCE_sub, subset_row = HVGs)
SCE_sub_corrected <- fastMNN(SCE_sub, batch = SCE_sub$sample)
PCs_sub = reducedDim(SCE_sub_corrected, "corrected")
reducedDim(SCE_sub, "UMAP") <- calculateUMAP(t(PCs_sub))
cl = clusterSNNGraph(SCE_sub_corrected, use.dimred = "corrected")
table(cl)
## cl
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2998 194 2221 3065 1549 4432 2885 3120 1195 5258 4583 730 909 1777 1138 149
## 17 18 19 20 21 22 23 24
## 142 32 13 30 640 360 88 43
SCE_sub$cluster_seqFISH = paste0("seqFISH_cluster_", cl)
SCE_sub$score = scores_all_df[colnames(SCE_sub), "score"]
plotUMAP(SCE_sub, colour_by = "cluster_seqFISH") + scale_colour_discrete() + ggtitle("Clusters using seqFISH genes")
plotUMAP(SCE_sub, colour_by = "score")
plotUMAP(SCE_sub, colour_by = "celltype") + scale_colour_manual(values = celltype_colours)
scores_all_df$cluster_seqFISH = SCE_sub[,names(scores_all)]$cluster_seqFISH
ggplot(scores_all_df, aes(x = cluster_seqFISH, y = score)) +
geom_boxplot(aes(fill = cluster_seqFISH)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
NULL
There are clusters that have a high uncertainty value, while others remain fairly low. This means that for these clusters, there may be additional genes that are needed to better estimate the cell-cell neighbourhoods.
Re-cluster the cells within these high uncertainty clusters using all genes, and extract their marker genes. Visualise these on the full UMAP and examine what genes they are.
clusters_ordered_uncertainty = names(sort(tapply(SCE_sub$score, SCE_sub$cluster_seqFISH, median), decreasing = TRUE))
clusters_ordered_uncertainty
## [1] "seqFISH_cluster_21" "seqFISH_cluster_15" "seqFISH_cluster_10"
## [4] "seqFISH_cluster_11" "seqFISH_cluster_1" "seqFISH_cluster_8"
## [7] "seqFISH_cluster_4" "seqFISH_cluster_6" "seqFISH_cluster_14"
## [10] "seqFISH_cluster_2" "seqFISH_cluster_20" "seqFISH_cluster_22"
## [13] "seqFISH_cluster_3" "seqFISH_cluster_18" "seqFISH_cluster_24"
## [16] "seqFISH_cluster_17" "seqFISH_cluster_23" "seqFISH_cluster_5"
## [19] "seqFISH_cluster_13" "seqFISH_cluster_19" "seqFISH_cluster_7"
## [22] "seqFISH_cluster_12" "seqFISH_cluster_9" "seqFISH_cluster_16"
minFDRs_list = list()
for (i in 1:5) {
print(i)
SCE_sub$cluster_seqFISH_tmp = SCE_sub$cluster_seqFISH == clusters_ordered_uncertainty[i]
g = plotUMAP(SCE_sub, colour_by = "cluster_seqFISH_tmp") +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
ggtitle(paste("Clusters using seqFISH genes on seqFISH genes UMAP, ",
clusters_ordered_uncertainty[i]))
print(g)
g = plotReducedDim(SCE_sub, dimred = "umap", colour_by = "cluster_seqFISH_tmp") +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey"), na.value = "grey") +
ggtitle(paste("Clusters using seqFISH genes on full data UMAP, ",
clusters_ordered_uncertainty[i])) +
coord_fixed()
print(g)
table(SCE_sub$celltype, SCE_sub$cluster_seqFISH_tmp)
# cluster within these cells
SCE_full = SCE[,colnames(SCE_sub)[SCE_sub$cluster_seqFISH_tmp]]
SCE_full <- logNormCounts(SCE_full)
fit = modelGeneVar(logcounts(SCE_full), block = SCE_full$sample)
HVGs = getTopHVGs(fit)
length(HVGs)
SCE_full <- runPCA(SCE_full, subset_row = HVGs)
SCE_full_corrected <- fastMNN(SCE_full, batch = SCE_full$sample)
SCE_full_clusters = clusterSNNGraph(SCE_full_corrected, use.dimred = "corrected")
table(SCE_full_clusters)
SCE_full$full_cluster = SCE_full_clusters
full_cluster_markers_array = getMarkerArray(SCE_full, "full_cluster")
minFDRs = rowMins(full_cluster_markers_array[,,"FDR"], na.rm = TRUE)
names(minFDRs) <- dimnames(full_cluster_markers_array)[[1]]
minFDRs_list[[i]] <- minFDRs
sort(minFDRs)[1:5]
table(minFDRs < 0.01)
head(minFDRs[minFDRs < 0.01])
rowData(SCE_full)$SYMBOL[order(minFDRs)[1:5]]
gList = sapply(rowData(SCE_full)$SYMBOL[order(minFDRs)[1:9]], function(gene) {
plotReducedDim(SCE, dimred = "umap",
# colour_by = rowData(SCE_full)$SYMBOL[order(minFDRs)[1]],
colour_by = gene,
swap_rownames = "SYMBOL",
by_exprs_values = "logcounts") +
coord_fixed()
}, simplify = FALSE)
g = wrap_plots(gList, nrow = 3, ncol = 3)
print(g)
}
## [1] 1
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] 2
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] 3
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] 4
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"
## [1] "13"
## [1] "14"
## [1] 5
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"
Overall it appears that cell cycle genes are lacking, as well as some relevant genes for NMPs, e.g. Hes1, Nkx1-2. A decent number of these genes are also in the smFISH set, so it’s worth seeing what difference these genes make and to which cells in terms of their neighbourhood similarities.
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BiocNeighbors_1.8.2 batchelor_1.6.2
## [3] igraph_1.2.6 bluster_1.0.0
## [5] scran_1.18.3 patchwork_1.1.1
## [7] scater_1.18.3 org.Mm.eg.db_3.12.0
## [9] AnnotationDbi_1.52.0 reshape_0.8.8
## [11] ggplot2_3.3.3 MouseGastrulationData_1.4.0
## [13] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [15] Biobase_2.50.0 GenomicRanges_1.42.0
## [17] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [19] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [21] MatrixGenerics_1.2.0 matrixStats_0.57.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5
## [3] RcppAnnoy_0.0.18 httr_1.4.2
## [5] tools_4.0.4 ResidualMatrix_1.0.0
## [7] R6_2.5.0 irlba_2.3.3
## [9] vipor_0.4.5 uwot_0.1.10
## [11] DBI_1.1.1 colorspace_2.0-0
## [13] withr_2.4.1 tidyselect_1.1.0
## [15] gridExtra_2.3 bit_4.0.4
## [17] curl_4.3 compiler_4.0.4
## [19] DelayedArray_0.16.1 labeling_0.4.2
## [21] scales_1.1.1 rappdirs_0.3.3
## [23] stringr_1.4.0 digest_0.6.27
## [25] rmarkdown_2.6 XVector_0.30.0
## [27] pkgconfig_2.0.3 htmltools_0.5.1.1
## [29] sparseMatrixStats_1.2.0 limma_3.46.0
## [31] dbplyr_2.1.0 fastmap_1.1.0
## [33] rlang_0.4.10 RSQLite_2.2.3
## [35] shiny_1.6.0 DelayedMatrixStats_1.12.2
## [37] farver_2.0.3 generics_0.1.0
## [39] BiocParallel_1.24.1 dplyr_1.0.3
## [41] RCurl_1.98-1.2 magrittr_2.0.1
## [43] BiocSingular_1.6.0 GenomeInfoDbData_1.2.4
## [45] scuttle_1.0.4 Matrix_1.3-2
## [47] Rcpp_1.0.6 ggbeeswarm_0.6.0
## [49] munsell_0.5.0 viridis_0.5.1
## [51] lifecycle_0.2.0 edgeR_3.32.1
## [53] stringi_1.5.3 yaml_2.2.1
## [55] zlibbioc_1.36.0 plyr_1.8.6
## [57] BiocFileCache_1.14.0 AnnotationHub_2.22.0
## [59] grid_4.0.4 blob_1.2.1
## [61] dqrng_0.2.1 promises_1.1.1
## [63] ExperimentHub_1.16.0 crayon_1.3.4
## [65] lattice_0.20-41 cowplot_1.1.1
## [67] beachmat_2.6.4 locfit_1.5-9.4
## [69] knitr_1.30 pillar_1.4.7
## [71] codetools_0.2-18 glue_1.4.2
## [73] BiocVersion_3.12.0 evaluate_0.14
## [75] BiocManager_1.30.10 vctrs_0.3.6
## [77] httpuv_1.5.5 gtable_0.3.0
## [79] purrr_0.3.4 assertthat_0.2.1
## [81] cachem_1.0.1 xfun_0.20
## [83] rsvd_1.0.3 mime_0.9
## [85] xtable_1.8-4 RSpectra_0.16-0
## [87] later_1.1.0.1 viridisLite_0.3.0
## [89] tibble_3.0.5 beeswarm_0.2.3
## [91] memoise_2.0.0 statmod_1.4.35
## [93] ellipsis_0.3.1 interactiveDisplayBase_1.28.0